The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.

Dependencies

Set user defined parameters

print(params)
## $autoKfold
## [1] FALSE
## 
## $run
## [1] TRUE
## 
## $save_figs
## [1] FALSE
## 
## $trimAnomalies
## [1] TRUE
## 
## $ecoregion
## [1] "CONUS"
## 
## $response
## [1] "BareGroundCover"
## 
## $removeTexasLouisianaPlain
## [1] FALSE
## 
## $removeAllAnoms
## [1] FALSE
## 
## $whichSecondBestMod
## [1] "auto"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
trimAnom <- params$trimAnomalies
removeTLP <- params$removeTexasLouisianaPlain
run <- params$run
autoKfold <- params$autoKfold
removeAllAnoms <- params$removeAllAnoms
whichSecondBestMod <- params$whichSecondBestMod

Load packages and source functions

# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/betaLASSO.R")

#source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
 
library(betareg)
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)

Read in data

Data compiled in the prepDataForModels.R script

here::i_am("Analysis/VegComposition/ModelFitting/03_modelFitting_testingBetaLASSO.Rmd")

# get data that already have 
modDat <- readRDS( here("Data_processed", "CoverData", "CoverModelData_withTreeNoTreePreds.rds")) %>% st_drop_geometry()

# change Level 1 cover variables to be between 0 and 1 rather than 0 and 100
modDat[,"TotalTreeCover"] <- modDat[,"TotalTreeCover"]/100
modDat[,"TotalHerbaceousCover"] <- modDat[,"TotalHerbaceousCover"]/100
modDat[,"BareGroundCover"] <- modDat[,"BareGroundCover"]/100
modDat[,"ShrubCover"] <- modDat[,"ShrubCover"]/100

# For all response variables, make sure there are no 0s add  .0001 from each, since the Beta model framework can't handle that
modDat[modDat$TotalTreeCover == 0 & !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.0001
modDat[modDat$TotalHerbaceousCover == 0  & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.0001
modDat[modDat$BareGroundCover == 0 & !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.0001
modDat[modDat$ShrubCover == 0 & !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.0001
modDat[modDat$AngioTreeCover_prop == 0 & !is.na(modDat$AngioTreeCover_prop), "AngioTreeCover_prop"] <- 0.0001
modDat[modDat$ConifTreeCover_prop == 0 & !is.na(modDat$ConifTreeCover_prop), "ConifTreeCover_prop"] <- 0.0001
modDat[modDat$C4GramCover_prop == 0 & !is.na(modDat$C4GramCover_prop), "C4GramCover_prop"] <- 0.0001
modDat[modDat$C3GramCover_prop == 0 & !is.na(modDat$C3GramCover_prop), "C3GramCover_prop"] <- 0.0001
modDat[modDat$ForbCover_prop == 0 & !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.0001

# For all response variables, make sure there are no values of 1 -- subtract .0001 from each, since the Beta model framework can't handle that
modDat[modDat$TotalTreeCover == 1 & !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.9999
modDat[modDat$TotalHerbaceousCover == 1  & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.9999
modDat[modDat$BareGroundCover == 1 & !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.9999
modDat[modDat$ShrubCover == 1 & !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.9999
modDat[modDat$AngioTreeCover_prop == 1 & !is.na(modDat$AngioTreeCover_prop), "AngioTreeCover_prop"] <- 0.9999
modDat[modDat$ConifTreeCover_prop == 1 & !is.na(modDat$ConifTreeCover_prop), "ConifTreeCover_prop"] <- 0.9999
modDat[modDat$C4GramCover_prop == 1 & !is.na(modDat$C4GramCover_prop), "C4GramCover_prop"] <- 0.9999
modDat[modDat$C3GramCover_prop == 1 & !is.na(modDat$C3GramCover_prop), "C3GramCover_prop"] <- 0.9999
modDat[modDat$ForbCover_prop == 1 & !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.9999

Prep data

Identify the ecoregion and response variable type to use in this model run (i.e. trees or no trees)

ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion," and the response variable is ",response))
## [1] "In this model run, the ecoregion is CONUS and the response variable is BareGroundCover"

Visualize the predictor variables

The following are the candidate predictor variables for this ecoregion:

if (ecoregion == "noTrees") {
  # select potential predictor variables for the ecoregion of interest
        prednames <-
          c(
"tmean"         , "prcp"              , "prcp_dry"      , "prcp_seasonality"      ,"prcpTempCorr"     ,"isothermality" ,       
"clay"          , "coarse"            , "carbon"        , "AWHC"                  ,"tmin_anom"        ,"tmax_anom"   ,         
"t_warm_anom"   , "prcp_wet_anom"     , "precp_dry_anom", "prcp_seasonality_anom" ,"prcpTempCorr_anom","isothermality_anom",   
"annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom"  , "frostFreeDays_anom")
  
} else if (ecoregion == "trees") {
  # select potential predictor variables for the ecoregion of interest
  prednames <- 
    c(
"tmean"             , "prcp"          , "prcp_dry"          , "prcpTempCorr"          ,"isothermality"    , "annWatDef" ,           
"sand"              , "coarse"        , "carbon"            , "AWHC"                  ,"tmin_anom"        , "tmax_anom" ,       
"t_warm_anom"       , "prcp_wet_anom" , "precp_dry_anom"    , "prcp_seasonality_anom" ,"prcpTempCorr_anom", "aboveFreezingMonth_anom",
"isothermality_anom", "annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom"  
)
} else if (ecoregion == "CONUS") {
  prednames <- c(
"tmean"         , "prcp"              , "prcp_seasonality"    , "prcpTempCorr"      , "isothermality"           ,"annWetDegDays" ,         
"sand"          , "coarse"            , "AWHC"                , "tmin_anom"         , "tmax_anom"               ,"prcp_anom",         
"t_warm_anom"   , "precp_dry_anom"    , "prcp_seasonality_anom", "prcpTempCorr_anom",  "aboveFreezingMonth_anom", "isothermality_anom",
"annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom"           
  )
} 

Scale the predictor variables for the model-fitting process

modDat_1 <- modDat 
allPreds <- modDat_1 %>% 
  dplyr::select(tmin:frostFreeDays,tmean_anom:frostFreeDays_anom, soilDepth:AWHC) %>% 
  names()
modDat_1_s <- modDat_1 %>% 
  mutate(across(all_of(allPreds), base::scale, .names = "{.col}_s")) 
saveRDS(modDat_1_s, file = "./models/scaledModelInputData.rds")

Subset the data to only include data for the ecoregion of interest based on classification from yes/no tree model (using bestLambda model)

if (ecoregion == "noTrees") {
  # select data for the ecoregion of interest
  modDat_1_s <- modDat_1_s %>%
    filter(treeProb_bestLambda_binary == 0)
} else if (ecoregion == "trees") {
  # select data for the ecoregion of interest
  modDat_1_s <- modDat_1_s %>%
    filter(treeProb_bestLambda_binary == 1)
} else if (ecoregion == "CONUS") {
  modDat_1_s <- modDat_1_s
}
# remove the rows that have no observations for the response variable of interest
modDat_1_s <- modDat_1_s[!is.na(modDat_1_s[,response]),]
# subset the data to only include these predictors, and remove any remaining NAs 
modDat_1_s <- modDat_1_s %>% 
  dplyr::select(prednames, paste0(prednames, "_s"), response, newRegion, Year, x, y, NA_L1NAME, NA_L2NAME) %>% 
  drop_na()

names(prednames) <- prednames
df_pred <- modDat_1_s[, prednames]

Visualize the response variable

hist(modDat_1_s[,response], main = paste0("Histogram of ",response),
     xlab = paste0(response))

create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    mutate(across(where(is.numeric), round, 4))
}

modDat_1_s[prednames] %>% 
  create_summary() %>% 
  knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
summaries of possible predictor variables
variable value_mean value_min value_median value_max
AWHC 14.7135 0.0000 14.1284 35.2881
VPD_min_anom -0.0147 -0.1740 -0.0165 0.1533
aboveFreezingMonth_anom 0.0510 -2.7826 0.0242 3.0161
annWatDef_anom -0.0535 -4.4040 -0.0521 1.0000
annWetDegDays 1778.2205 81.2108 1552.4749 6800.4486
annWetDegDays_anom 0.0108 -1.8263 0.0052 1.0000
coarse 9.3188 0.0000 5.5749 79.9649
isothermality 37.7035 23.3000 37.6135 63.7425
isothermality_anom 0.4880 -14.9590 0.4334 12.1105
prcp 452.0553 47.6797 391.2113 3939.2400
prcpTempCorr 0.0363 -0.8510 0.1141 0.7004
prcpTempCorr_anom 0.0067 -0.5936 0.0098 0.5875
prcp_anom 0.0043 -1.5324 0.0039 0.7910
prcp_seasonality 0.9824 0.4564 0.9408 2.2319
prcp_seasonality_anom -0.0106 -0.7055 -0.0047 0.4421
precp_dry_anom -0.0238 -6.7500 0.0375 1.0000
sand 46.6132 0.0000 45.3292 99.0864
t_warm_anom -0.4210 -5.5689 -0.4189 6.0847
tmax_anom -0.2491 -5.1921 -0.2763 3.7520
tmean 11.0772 -2.2524 9.8504 24.6330
tmin_anom -0.4508 -5.5408 -0.4156 5.0203

Histograms of raw and scaled predictors

scaleFigDat_1 <- modDat_1_s %>% 
  dplyr::select(c(x, y, Year, prednames)) %>% 
  pivot_longer(cols = all_of(names(prednames)), 
               names_to = "predNames", 
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>% 
  dplyr::select(c(x, y, Year, paste0(prednames, "_s"))) %>% 
  pivot_longer(cols = all_of(paste0(prednames,"_s"
                                    )), 
               names_to = "predNames", 
               values_to = "predValues_scaled", 
               names_sep = ) %>% 
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))

scaleFigDat_3 <- scaleFigDat_1 %>% 
  left_join(scaleFigDat_2)

ggplot(scaleFigDat_3) + 
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") + 
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") + 
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")

modDat_1_s <- modDat_1_s %>% 
  rename_with(~paste0(.x, "_raw"), 
                any_of(names(prednames))) %>% 
  rename_with(~str_remove(.x, "_s$"), 
              any_of(paste0(names(prednames), "_s")))

Predictor variables compared to binned response variables

set.seed(12011993)
# vector of name of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <-  modDat_1_s  %>% 
  slice_sample(n = 5e4) %>% 
   rename(response = all_of(response)) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.50 ~ ".50", 
    response > .50 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  )) %>% 
  dplyr::select(c(response, prednames)) %>% 
  tidyr::pivot_longer(cols = unname(prednames), 
               names_to = "predictor", 
               values_to = "value"
               )  
 

  ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
  geom_boxplot() +
  facet_wrap(~predictor , scales = 'free_y') + 
  ylab("Predictor Variable Values") + 
    xlab(response)

Model Fitting

Visualize the spatial blocks and how they differ across environmental space

First, if there are observations in Louisiana, sub-sample them so they’re not so over-represented in the dataset

## make data into spatial format
modDat_1_sf <- modDat_1_s %>% 
  st_as_sf(coords = c("x", "y"), crs = st_crs("EPSG:4326"))


# download map info for visualization
data(state_boundaries_wgs84) 

cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
  dplyr::filter(NAME!="Hawaii") %>%
  dplyr::filter(NAME!="Alaska") %>%
  dplyr::filter(NAME!="Puerto Rico") %>%
  dplyr::filter(NAME!="American Samoa") %>%
  dplyr::filter(NAME!="Guam") %>%
  dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
  dplyr::filter(NAME!="United States Virgin Islands") %>%
  sf::st_sf() %>%
  sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
  #sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))
if (ecoregion %in% c("Forest", "eastForest", "forest")){
modDat_1_s$uniqueID <- 1:nrow(modDat_1_s)
modDat_1_sf$uniqueID <- 1:nrow(modDat_1_sf)

# find which observations overlap with Louisiana
obs_LA_temp <- st_intersects(modDat_1_sf, cropped_states[cropped_states$NAME == "Louisiana",], sparse = FALSE)
if (sum(obs_LA_temp) > 0) {
 obs_LA_1 <- modDat_1_sf[which(obs_LA_temp == TRUE, arr.ind = TRUE)[,1],]
# now, find only those within the oversampled area (near the coast)
dims <- c(xmin = 730439.1, xmax = 1042195.5, ymax = -1222745.2, ymin = -1390430.9)
badBox <- st_bbox(dims) %>% 
  st_as_sfc() %>% 
  st_set_crs(value = st_crs(modDat_1_sf))
  
obs_LA_temp2 <- st_intersects(obs_LA_1, badBox, sparse = FALSE)
obs_LA_2 <- obs_LA_1[which(obs_LA_temp2 == TRUE, arr.ind = TRUE)[,1],]

# subsample so there aren't so many 
# get every 7th observation
obs_LA_sampled <- obs_LA_2[seq(from = 1, to = nrow(obs_LA_2), by = 10),]
# remove observations that aren't sampled from the larger dataset
obsToRemove <- obs_LA_2[!(obs_LA_2$uniqueID %in% obs_LA_sampled$uniqueID),]

modDat_1_sf <- modDat_1_sf[!(modDat_1_sf$uniqueID %in% obsToRemove$uniqueID),]
modDat_1_s <- modDat_1_s[!(modDat_1_s$uniqueID %in% obsToRemove$uniqueID),] 
}
}
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames)])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))

# for ecoregions that are very tiny/extremely different from the rest of the other recoregions climatically, aggregate them together
if(ecoregion == "noTrees") {
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("EVERGLADES", "SOUTHEASTERN USA PLAINS", "TEXAS-LOUISIANA COASTAL PLAIN", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "TAMAULIPAS-TEXAS SEMIARID PLAIN"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS AND EVERGLADES AND LOUISIANA PLAIN"
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("EVERGLADES", "SOUTHEASTERN USA PLAINS", "TEXAS-LOUISIANA COASTAL PLAIN", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "TAMAULIPAS-TEXAS SEMIARID PLAIN"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS AND EVERGLADES AND LOUISIANA PLAIN"

}

# make a table of n for each region
modDat_1_s %>% 
  group_by(NA_L2NAME) %>% 
  dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>% 
  rename("Level_2_Ecoregion" = NA_L2NAME)%>% 
  kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Level_2_Ecoregion Number_Of_Observations
COLD DESERTS 185083
MARINE WEST COAST FOREST 556
MEDITERRANEAN CALIFORNIA 17944
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 48
MIXED WOOD PLAINS 336
MIXED WOOD SHIELD 52
OZARK/OUACHITA-APPALACHIAN FORESTS 384
SOUTH CENTRAL SEMIARID PRAIRIES 109813
SOUTHEASTERN USA PLAINS 2112
TAMAULIPAS-TEXAS SEMIARID PLAIN 7380
TEMPERATE PRAIRIES 14700
TEXAS-LOUISIANA COASTAL PLAIN 4135
UPPER GILA MOUNTAINS 7284
WARM DESERTS 67673
WEST-CENTRAL SEMIARID PRAIRIES 89386
WESTERN CORDILLERA 40645
WESTERN SIERRA MADRE PIEDMONT 6884

Then, look at the spatial distribution and environmental characteristics of the grouped ecoregions

map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf#[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
          ,
          aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  geom_point(data=modDat_1_s#[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
             ,
             alpha=0.5, 
             aes(x = x, y = y, color=as.factor(NA_L2NAME)), alpha = .1) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
 # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")

hull <- modDat_1_sf %>%
  ungroup() %>%
  group_by(NA_L2NAME) %>%
  slice(chull(tmean, prcp))

plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
  geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
  geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
  theme_minimal() + xlab("Annual Average T_mean - long-term average") +
  ylab("Annual Average Precip - long-term average") #+
  #scale_color_okabeito() +
  #scale_fill_okabeito()

plot2<-ggplot(data=modDat_1_sf %>%
                pivot_longer(cols=tmean:prcp),
              aes(x=value,group=name)) +
  # geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
  geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
  theme_minimal() +
  facet_wrap(~name,scales='free')# +
  #scale_color_okabeito() +
  #scale_fill_okabeito()
 
library(patchwork)
(combo <- (map1+plot1)/plot2) 

# 
# ggplot(data = modDat_1_s) +
#   geom_density(aes(ShrubCover_transformed, col = NA_L2NAME)) +
#   xlim(c(0,100))

Fit a global model with all of the data

First, fit a LASSO regression model using the glmnet R package

  • This regression is a beta glm with a logit link (using custom function from Daniel)
  • Use cross validation across level 2 ecoregions to tune the lambda parameter in the LASSO model
  • this model is fit to using the scaled weather/climate/soils variables
  • this list of possible predictors includes:
    1. main effects
    2. interactions between all soils variables
    3. interactions between climate and weather variables
    4. transformed main effects (squared, log-transformed (add a uniform integer – 20– to all variables prior to log-transformation), square root-transformed (add a uniform integer – 20– to all variables prior to log-transformation))

Get rid of transformed predictions and interactions that are correlated

# get first pass of names correlated variables
X_df <- X %>% 
  as.data.frame() %>% 
  dplyr::select(-'(Intercept)')  
corrNames_i <- X_df %>% 
  cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
# remove those names that are untransformed main effects 
  # vector of columns to remove 
badNames <- corrNames_i[!(corrNames_i %in% prednames)]
while (sum(!(corrNames_i %in% prednames))>0) {
 corrNames_i <-  X_df %>% 
    dplyr::select(-badNames) %>% 
     cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
 # update the vector of names to remove 
 badNames <- unique(c(badNames, corrNames_i[!(corrNames_i %in% prednames)]))
}

## see if there are any correlated variables left (would be all main effects at this point)
# if there are, step through and remove the variable that each is most correlated with 
if (length(corrNames_i)>1) {
  for (i in 1:length(corrNames_i)) {
    X_i <- X_df %>% 
      dplyr::select(-badNames)
    if (corrNames_i[i] %in% names(X_i)) {
    corMat_i <- cor(x = X_i[corrNames_i[i]], y = X_i %>% dplyr::select(-corrNames_i[i])) 
    badNames_i <- colnames(corMat_i)[abs(corMat_i)>=.7]
    # if there are any predictors in the 'badNames_i', remove them from this list
    if (length(badNames_i) > 0 & sum(c(badNames_i %in% prednames))>0) {
        badNames_i <- badNames_i[!(badNames_i %in% prednames)]
    }
    badNames <- unique(c(badNames, badNames_i))
    }
  }
}
## update the X matrix to exclude these correlated variables
X <- X[,!(colnames(X) %in% badNames)]
if (run == TRUE) {
  # set up custom folds
    # get the ecoregions for training lambda
  train_eco <- modDat_1_s$NA_L2NAME#[train]
  
  # Fit model -----------------------------------------------
  # specify leave-one-year-out cross-validation
  my_folds <- as.numeric(as.factor(train_eco))

    # set up parallel processing
    library(doMC)
    # this computer has 16 cores (parallel::detectCores())
    registerDoMC(cores = 8)
    
    fit <- cv.glmnet(
      x = X[,2:ncol(X)], 
      y = y, 
      family = quasibeta_family(),
      keep = FALSE,
      alpha = 1,  # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
      lambda = lambdas,
      relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE),
      #nlambda = 100,
      type.measure="mse",
      #penalty.factor = pen_facts,
      foldid = my_folds,
      #thresh = thresh,
      standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
      parallel = TRUE
    )
     base::saveRDS(fit, paste0("../ModelFitting/models/betaLASSO/", response, "_globalLASSOmod_betaLogitLink_",ecoregion,"_removeAnomalies", removeAllAnoms,".rds"))
    
    
     best_lambda <- fit$lambda.min
  # save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
  lambda_1SE <- fit$lambda.1se
  # save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
  lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
 
## Now, we need to do stability selection to ensure the coefficients that are being chosen with each lambda are stable 

## stability selection for best lambda model 
# setup params
p <- ncol(X[,2:ncol(X)]) # of parameters
n <- length(y) # of observations
n_iter <- 100        # number of subsamples
sample_frac <- 0.75  # fraction of data to subsample
lambda_val <- best_lambda    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_bestL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = quasibeta_family(),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  select_bestL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_bestL[select_bestL] <- selection_counts_bestL[select_bestL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_bestL <- selection_counts_bestL / n_iter
selection_prob_bestL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_bestL)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
bestLambda_coef <- selection_prob_bestL_df[selection_prob_bestL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

#//////
# stability selection for 1se lambda model
lambda_val <-  lambda_1SE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_1seL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = quasibeta_family(),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected_1seL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_1seL[selected_1seL] <- selection_counts_1seL[selected_1seL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_1seL <- selection_counts_1seL / n_iter
selection_prob_1seL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_1seL)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
seLambda_coef <- selection_prob_1seL_df[selection_prob_1seL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

# stability selection for half se lambda model
lambda_val <- lambda_halfSE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts_halfseL <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = quasibeta_family(),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected_halfseL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts_halfseL[selected_halfseL] <- selection_counts_halfseL[selected_halfseL] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected_halfseL over 100 iterations)
selection_prob_halfseL <- selection_counts_halfseL / n_iter
selection_prob_halfseL_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob_halfseL)
)

# get those variables that are selected_halfseL_halfseL in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
halfseLambda_coef <- selection_prob_halfseL_df[selection_prob_halfseL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

## If prompted to do so by the input arguments, remove any predictors that are for weather anomalies whose corresponding climate predictor is not included in the model 
if (trimAnom == TRUE) {
  # get unique predictors
  bestLamb_all <- bestLambda_coef$VariableName %>% #[str_detect(bestLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "\\^2\\)") %>% 
    str_remove(pattern = "\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  if (ncol(bestLamb_all) == 1) {
    bestLamb_all <- unique(bestLamb_all)
  } else {
    bestLamb_all <-  c(bestLamb_all[bestLamb_all[,1] !="",1], bestLamb_all[bestLamb_all[,2] !="",2]) %>% 
      unique()
  }
  # get anomalies
   bestLamb_anom <- bestLamb_all[bestLamb_all %in% prednames_weath]
  # get climate
   bestLamb_clim <- bestLamb_all[bestLamb_all %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   bestLamb_anomsWithMissingClim <- bestLamb_anom[!(str_remove(bestLamb_anom, "_anom") %in% bestLamb_clim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list

   if (length(bestLamb_anomsWithMissingClim) != 0) {
   
        bestLambda_coef_NEW <- bestLambda_coef
      for (i in 1:length(bestLamb_anomsWithMissingClim)) {
     bestLambda_coef_NEW <- bestLambda_coef_NEW[!str_detect(bestLambda_coef_NEW$VariableName, pattern = bestLamb_anomsWithMissingClim[i]),]
      }
     
   bestLambda_coef <- bestLambda_coef_NEW
   }
   
   
  #//// 1 se lambda model
      if (nrow(seLambda_coef) !=0) {
        # get unique predictors
  seLamb_all <- seLambda_coef$VariableName %>% #[str_detect(seLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "_s\\^2\\)") %>% 
    str_remove(pattern = "\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  if (ncol(seLamb_all) == 1) {
    seLamb_all <- unique(seLamb_all)
  } else {
    seLamb_all <-  c(seLamb_all[seLamb_all[,1] !="",1], seLamb_all[seLamb_all[,2] !="",2]) %>% 
      unique()
  }
  # get anomalies
   seLamb_anom <- seLamb_all[seLamb_all %in% prednames_weath]
  # get climate
   seLamb_clim <- seLamb_all[seLamb_all %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   seLamb_anomsWithMissingClim <- seLamb_anom[!(str_remove(seLamb_anom, "_anom") %in%  seLamb_clim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list
      if (length(seLamb_anomsWithMissingClim) != 0) {
    seLambda_coef_NEW <- seLambda_coef
   for (i in 1:length(seLamb_anomsWithMissingClim)) {
     seLambda_coef_NEW <- seLambda_coef_NEW[!str_detect(seLambda_coef_NEW$VariableName, pattern = seLamb_anomsWithMissingClim[i]),]
   }
   seLambda_coef <- seLambda_coef_NEW
      }
   }
  
  #//// 1 se lambda model
      if (nrow(halfseLambda_coef) !=0) {
        # get unique predictors
  halfseLamball<- halfseLambda_coef$VariableName %>% #[str_detect(halfseLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "_s\\^2\\)") %>% 
    str_remove(pattern = "\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  
    if (ncol(halfseLamball) == 1) {
    halfseLamball <- unique(halfseLamball)
  } else {
    halfseLamball <-  c(halfseLamball[halfseLamball[,1] !="",1], halfseLamball[halfseLamball[,2] !="",2]) %>% 
      unique()
  }
  
  # get anomalies
   halfseLambanom <- halfseLamball[halfseLamball %in% prednames_weath]
  # get climate
   halfseLambclim <- halfseLamball[halfseLamball %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   halfseLambanomsWithMissingClim <- halfseLambanom[!(str_remove(halfseLambanom, "_anom_s") %in%  halfseLambclim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list

   
   ##
        if (length(halfseLambanomsWithMissingClim) != 0) {
   halfseLambda_coef_NEW <- halfseLambda_coef
   for (i in 1:length(halfseLambanomsWithMissingClim)) {
     halfseLambda_coef_NEW <- halfseLambda_coef_NEW[!str_detect(halfseLambda_coef_NEW$VariableName, pattern = halfseLambanomsWithMissingClim[i]),]
   }
   halfseLambda_coef <- halfseLambda_coef_NEW
      }
   }
    ##
  
      }


## 
## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
  mat_glmnet_best <- bestLambda_coef$VariableName 

  if (length(mat_glmnet_best) == 0) {
    f_glm_bestLambda <- as.formula(paste0(response, "~ 1"))
  } else {
  f_glm_bestLambda <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_best, collapse = " + ")))
  }
  
## fit using betareg
  fit_glm_bestLambda <- fit_glm_bestLambda_quasiBETA <- glm(formula = f_glm_bestLambda, data = modDat_1_s, family =quasibeta_family(link = "logit") )
    
  #fit_glm_bestLambda_BETAREG <- betareg(formula =  f_glm_bestLambda, data = modDat_1_s, link = "logit")
  ## fit using quasi-beta
  # ## compare coefficients
  # modCoeffsTest <- data.frame("coefficientName" = names(coefficients(fit_glm_bestLambda_BETAREG)),
  #                             "coefficientValue" = coefficients(fit_glm_bestLambda_BETAREG), 
  #                             "confint_low" = confint(fit_glm_bestLambda_BETAREG)[,1],
  #                             "confint_high" = confint(fit_glm_bestLambda_BETAREG)[,2],
  #                             "type" = "betareg") %>% 
  #   rbind(data.frame("coefficientName" = names(coefficients(fit_glm_bestLambda_quasiBETA)),
  #                             "coefficientValue" = coefficients(fit_glm_bestLambda_quasiBETA), 
  #                             "confint_low" = coefficients(fit_glm_bestLambda_quasiBETA) - ( summary(fit_glm_bestLambda_quasiBETA)$coefficients[,2] * 1.96) ,
  #                             "confint_high" = coefficients(fit_glm_bestLambda_quasiBETA) + ( summary(fit_glm_bestLambda_quasiBETA)$coefficients[,2] * 1.96),
  #                             "type" = "quasiBeta")) %>% 
  #   filter(coefficientName != "(phi)")
  # ggplot(data = modCoeffsTest) + 
  #   geom_point(aes(x = coefficientValue, y = coefficientName, col = type)) + 
  #   geom_errorbar(aes(xmin = confint_low, xmax = confint_high, y = coefficientName, col = type)) + 
  #   geom_vline(aes(xintercept = 0), lty = 3)
  # 
   ## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
  mat_glmnet_1se <- seLambda_coef$VariableName
    
  if (length(mat_glmnet_1se) == 0) {
    f_glm_1se <- as.formula(paste0(response, "~ 1"))
  } else {
  f_glm_1se <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_1se, collapse = " + ")))
  }


  fit_glm_se <- glm(formula = f_glm_1se, data = modDat_1_s, family =quasibeta_family(link = "logit") )
  # glm(data = modDat_1_s, formula = f_glm_1se,
  #                   family =  stats::Gamma(link = "log"))
  
     ## fit w/ the identified coefficients from the '.5se' lambda, but using the glm function
  mat_glmnet_halfse <- halfseLambda_coef$VariableName
  
  if (length(mat_glmnet_halfse) == 0) {
    f_glm_halfse <- as.formula(paste0(response, "~ 1"))
  } else {
  f_glm_halfse <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_halfse, collapse = " + ")))
  }

  fit_glm_halfse <- glm(formula = f_glm_halfse, data = modDat_1_s, family =quasibeta_family(link = "logit") )
  
  ## save models 
  if (trimAnom == TRUE) {
    saveRDS(fit_glm_bestLambda, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_bestLambdaGLM.rds"))
  saveRDS(fit_glm_halfse, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_halfSELambdaGLM.rds"))
  saveRDS(fit_glm_se, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_oneSELambdaGLM.rds"))
  } else {
    saveRDS(fit_glm_bestLambda, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_bestLambdaGLM.rds"))
  saveRDS(fit_glm_halfse, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_halfSELambdaGLM.rds"))
  saveRDS(fit_glm_se, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_oneSELambdaGLM.rds"))
    
  }
  
  ## save the R environment after running the models 
  save(f_glm_halfse, mat_glmnet_halfse, halfseLambda_coef,
              f_glm_1se, mat_glmnet_1se, seLambda_coef,
              f_glm_bestLambda, mat_glmnet_best, bestLambda_coef,
              file = paste0("./models/interimModelFittingObjects_", response, "_", ecoregion, "_removeAnomalies", removeAllAnoms,"_betaReg.rds" ))
  } else {
    # read in LASSO object
    fit <- readRDS(paste0("../ModelFitting/models/betaLASSO/", response, "_globalLASSOmod_betaLogitLink_",ecoregion,"_removeAnomalies", removeAllAnoms,".rds"))
    
    # read in R objects having to do w/ model fitting 
    load(file = paste0("./models/interimModelFittingObjects_", response, "_", ecoregion, "_removeAnomalies", removeAllAnoms,"_betaReg.rds" ))
      
    if (trimAnom == TRUE) {
    fit_glm_bestLambda <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_bestLambdaGLM.rds"))
  fit_glm_halfse <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_halfSELambdaGLM.rds"))
 fit_glm_se <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_oneSELambdaGLM.rds"))
  } else {
    fit_glm_bestLambda <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_bestLambdaGLM.rds"))
  fit_glm_halfse <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_halfSELambdaGLM.rds"))
  fit_glm_se <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_oneSELambdaGLM.rds"))
    
  }
    
  }


  # assess model fit
  # assess.glmnet(fit$fit.preval, #newx = X[,2:293], 
  #               newy = y, family = stats::Gamma(link = "log"))
  # save the minimum lambda
  best_lambda <- fit$lambda.min
  # save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
  lambda_1SE <- fit$lambda.1se
  # save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
  lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
 
  print(fit)     
## 
## Call:  cv.glmnet(x = X[, 2:ncol(X)], y = y, lambda = lambdas, type.measure = "mse",      foldid = my_folds, keep = FALSE, parallel = TRUE, relax = ifelse(response ==          "ShrubCover", yes = TRUE, no = FALSE), family = quasibeta_family(),      alpha = 1, standardize = FALSE) 
## 
## Measure: Mean-Squared Error 
## 
##       Lambda Index Measure       SE Nonzero
## min 0.001482   128 0.01270 0.001667      60
## 1se 0.004199   113 0.01436 0.002088      27
  plot(fit)

Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)

  ## predict on the test data
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response")
  optimal_pred_1se <-  predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
  optimal_pred_halfse <- predict(fit_glm_halfse, newx = X[,2:ncol(X)], type = "response")
  
    null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]), 
      formula = y ~ 1, #data = modDat_1_s, 
      family = quasibeta_family(link = "logit")
      #formula = y ~ 1, family = stats::Gamma(link = "log")
      )
  null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
                       )

  # ## snap values above 100 and below 0 to be 100 or z
  # #optimal_pred[optimal_pred>100] <- 100
  # optimal_pred[optimal_pred<0] <- 0
  # #optimal_pred_1se[optimal_pred_1se>100] <- 100
  # optimal_pred_1se[optimal_pred_1se<0] <- 0
  # #optimal_pred_halfse[optimal_pred_halfse>100] <- 100
  # optimal_pred_halfse[optimal_pred_halfse<0] <- 0
  # 
  # save data
  fullModOut <- list(
    "modelObject" = fit,
    "nullModelObject" = null_fit,
    "modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
      obs=y,
                    pred_opt=optimal_pred, 
                    pred_opt_se = optimal_pred_1se,
                    pred_opt_halfse = optimal_pred_halfse,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ))

ggplot() + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_halfse), col = "orange", alpha = .1) + ## predictions w/ the CV model (.5se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) + 
  labs(title = "A rough comparison of observed and model-predicted values", 
       subtitle = "black = observed values \n red = predictions from 'best lambda' model \n orange = predictions for '1/2se' lambda model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
  xlab(colnames(X)[2])

  #ylim(c(0,200))

The internal cross-validation process to fit the global LASSO model identified an optimal lambda value (regularization parameter) of r{print(best_lambda)}. The lambda value such that the cross validation error is within 1 standard error of the minimum (“1se lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients were kept in each model:

# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>% 
  data.frame() 
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model 
coef_glm_1se <- coef(fit_glm_se) %>% 
  data.frame() 
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# coefficient matrix from the 'half se' model 
coef_glm_halfse <- coef(fit_glm_halfse) %>% 
  data.frame() 
coef_glm_halfse$coefficientName <- rownames(coef_glm_halfse)
names(coef_glm_halfse)[1] <- "coefficientValue_halfseLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_halfse) %>% 
  full_join(coef_glm_1se) %>% 
  dplyr::select(coefficientName, coefficientValue_bestLambda,
                coefficientValue_halfseLambda, coefficientValue_1seLambda)

globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]

## also, get the number of unique variables in each model 
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# for best lambda model
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE)) 
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
prednames_fig <- prednames_fig
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
if (length(globModTerms_1se) == 1) {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- c(0)
} else {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- length(prednames_fig_1se)
}
# for 1/2SE lambda model
globModTerms_halfse <- coefs[!is.na(coefs$coefficientValue_halfseLambda), "coefficientName"]
if (length(globModTerms_halfse) == 1) {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- c(0)
} else {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- length(prednames_fig_halfse)
}
# make a table
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", 
                           "Value from 1/2 se lambda", "Value from 1se lambda model")
      ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Coefficient Name Value from best lambda model Value from 1/2 se lambda Value from 1se lambda model
(Intercept) -1.8117495 -1.7945900 -1.7694928
tmean 0.0420468 0.0799076 0.0203071
prcp -1.0290382 -0.7249707 -0.7202501
prcpTempCorr -0.1302178 NA NA
isothermality 0.2408415 0.2954096 0.2777369
annWetDegDays -0.4350991 -0.6892232 -0.6051050
sand 0.0007494 -0.0072114 0.0097657
coarse -0.1909375 -0.2002561 -0.2301594
AWHC 0.0797108 0.0795450 0.0629004
prcp_anom 0.0684421 NA 0.0646436
prcp_seasonality_anom -0.0187926 NA NA
isothermality_anom -0.0086028 NA NA
annWetDegDays_anom 0.0206591 NA 0.0218393
I(tmean^2) -0.0400911 -0.0488557 -0.0470677
I(prcp^2) 0.0415890 0.0474086 0.1132394
I(prcpTempCorr^2) -0.2353772 -0.2113252 -0.2425328
I(prcp_seasonality_anom^2) -0.0053628 NA NA
I(sand^2) 0.0766532 0.0751029 0.0624226
I(AWHC^2) -0.0110367 -0.0124815 NA
prcp:annWetDegDays 0.1460402 0.1630710 0.0480336
annWetDegDays:prcp_seasonality_anom 0.0270093 NA NA
prcp:annWetDegDays_anom 0.0058933 NA NA
prcp_anom:annWetDegDays_anom 0.0064824 NA 0.0052755
prcp_seasonality_anom:annWetDegDays_anom 0.0093297 NA NA
prcp:isothermality -0.0074773 0.0398744 0.1367888
isothermality:prcp_seasonality -0.1155906 -0.1025795 NA
prcp:isothermality_anom 0.0080500 NA NA
prcp_anom:isothermality_anom -0.0137803 NA NA
prcp:prcp_seasonality -0.0250086 NA NA
prcp:prcpTempCorr -0.3448210 -0.2138782 NA
prcpTempCorr:prcp_anom 0.0387921 NA NA
prcp_anom:prcpTempCorr_anom 0.0088728 NA 0.0079677
prcpTempCorr:prcp_seasonality_anom -0.0274198 NA NA
prcp_seasonality_anom:prcpTempCorr_anom 0.0137646 NA NA
tmean:prcpTempCorr 0.1812364 0.1337496 0.1405301
sand:coarse -0.0171700 -0.0150358 NA
isothermality:prcpTempCorr NA 0.0878160 0.1239949
prcp_anom:prcpTempCorr NA NA 0.0389813
# calculate RMSE of all models 
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")], truth = "obs", estimate = "pred_opt")$.estimate
RMSE_halfse <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_halfse")], truth = "obs", estimate = "pred_opt_halfse")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")], truth = "obs", estimate = "pred_opt_se")$.estimate
# calculate bias of all models
bias_best <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt)
bias_halfse <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_halfse)
bias_1se <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_se)


uniqueCoeffs <- data.frame("Best lambda model" = c(signif(RMSE_best,3), as.character(signif(bias_best, 3)),
  as.integer(length(globModTerms)-1), as.integer(prednames_fig_num), 
                                                   as.integer(sum(prednames_fig %in% c(prednames_clim))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_weath))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_soils)))
                                                   ), 
                           "1/2 se lambda model" = c(signif(RMSE_halfse,3), as.character(signif(bias_halfse, 3)),
                             length(globModTerms_halfse)-1, prednames_fig_halfse_num,
                                                   sum(prednames_fig_halfse %in% c(prednames_clim)),
                                                   sum(prednames_fig_halfse %in% c(prednames_weath)),
                                                   sum(prednames_fig_halfse %in% c(prednames_soils))), 
                           "1se lambda model" = c(signif(RMSE_1se,3), as.character(signif(bias_1se, 3)),
                             length(globModTerms_1se)-1, prednames_fig_1se_num,
                                                   sum(prednames_fig_1se %in% c(prednames_clim)),
                                                   sum(prednames_fig_1se %in% c(prednames_weath)),
                                                   sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias: mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
                             "Number of unique climate coefficients", 
                             "Number of unique weather coefficients",  
                             "Number of unique soils coefficients"
                             )

kable(uniqueCoeffs, 
      col.names = c("Best lambda model", "1/2 se lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Best lambda model 1/2 se lambda model 1se lambda model
RMSE 0.0905 0.0919 0.0919
bias: mean(obs-pred.) -5.24e-11 -4.19e-11 -8.35e-13
Total number of coefficients 35 19 20
Number of unique coefficients 14 9 11
Number of unique climate coefficients 6 6 5
Number of unique weather coefficients 5 0 3
Number of unique soils coefficients 3 3 3

Visualizations of Model Predictions and Residuals – using best lambda model

observed vs. predicted values

As the alternative to the best lambda model, use the model (1se or 1/2se of best Lambda) that has the fewest number of unique predictors

if (whichSecondBestMod == "auto") {
  # name of model w/ fewest # of predictors (but more than 0)
uniqueCoeff_min <- min(as.numeric(uniqueCoeffs[4,2:3])[which(as.numeric(uniqueCoeffs[4,2:3]) > 0)])
alternativeModel <- names(uniqueCoeffs[4,2:3])[which(uniqueCoeffs[4,2:3] == uniqueCoeff_min)]

if (is.finite(uniqueCoeff_min)) {
  if (length(alternativeModel) == 1) {
  if (alternativeModel == "X1.2.se.lambda.model") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
} else if (alternativeModel == "X1se.lambda.model") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
}
} else {
  # if both alternative models have the same number of unique coefficients, chose the model that has the fewest number of total predictors
  uniqueCoeff_min2 <- min(as.numeric(uniqueCoeffs[3,alternativeModel]))
alternativeModel2 <- names(uniqueCoeffs[3,alternativeModel])[which(uniqueCoeffs[3,alternativeModel] == uniqueCoeff_min2)]
# if both models still have the same number of unique coefficeints, chose the 1se lambda model
if (length(alternativeModel2 == 2)) {
  alternativeModel2 <- "X1se.lambda.model"
}
if (alternativeModel2 == "X1.2.se.lambda.model") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
} else if (alternativeModel2 == "X1se.lambda.model") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
}
}
  }else {
    mod_secondBest <- NULL
  name_secondBestMod <- "Intercept_Only"
  prednames_secondBestMod <- NULL
}
} else if (whichSecondBestMod == "1se") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
} else if (whichSecondBestMod == "halfse") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
}

Predicting on the data

  # create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
  df_out <- df
  response_name <- paste0(response, "_pred")
  preds <- predict(mod, newx= df_out, #s="lambda.min", 
                                     type = "response")
  preds[preds<0] <- 0
  #preds[preds>100] <- 100
  df_out <- df_out %>% cbind(preds)
   colnames(df_out)[ncol(df_out)] <- response_name
  return(df_out)
}

pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])

## back-transform the 
# add back in true y values
pred_glm1 <- pred_glm1 %>% 
  cbind(data.frame(response = modDat_1_s[,c(response)])) 
names(pred_glm1)[names(pred_glm1) == "response"] <- response

# add back in lat/long data 
pred_glm1 <- pred_glm1 %>% 
  cbind(modDat_1_s[,c("x", "y", "Year")])

pred_glm1$resid  <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 1 | pred_glm1$resid < 1,"extremeResid"] <- 1
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {

pred_glm1_1se <- predict_by_response(mod_secondBest, X[,2:ncol(X)])

# add back in true y values
pred_glm1_1se <- pred_glm1_1se %>% 
  cbind(data.frame(response = modDat_1_s[,c(response)])) 
names(pred_glm1_1se)[names(pred_glm1_1se) == "response"] <- response

# add back in lat/long data 
pred_glm1_1se <- pred_glm1_1se %>% 
  cbind(modDat_1_s[,c("x", "y", "Year")])

pred_glm1_1se$resid <- pred_glm1_1se[,response] - pred_glm1_1se[,paste0(response, "_pred")]
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se[pred_glm1_1se$resid > 1 | pred_glm1_1se$resid < -1,"extremeResid"] <- 1
}

Maps of Observations, Predictions, and Residuals=

Observations across the temporal range of the dataset

# rasterize
# get reference raster
# use the gridMet raster 
test_rast_dayMet <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
  #terra::project(terra::crs("EPSG:4326")) %>%
  terra::crop(ext(-2000000, 2600000, -1900000, 1500000))

## aggregate to a 4x4km^2 grid
test_rast <- test_rast_dayMet %>% 
  terra::aggregate(fact = 8, fun = "mean")
# test_rast_Untranfsormed <-  rast("../../../Data_raw/gridMet/pr_2020.nc") #%>% ## mean cell size of unaggregated grid size (16709380m^2)
# test_rast <- test_rast_Untranfsormed %>% 
#    terra::aggregate(fact = 2, fun = "mean") #%>% 
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2") 
## Reading layer `NA_CEC_Eco_Level2' from data source 
##   `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/cleanPED/PED_vegClimModels/Data_raw/Level2Ecoregions' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
regions <- regions %>% 
  st_transform(crs = st_crs(crs(test_rast))) %>% 
  st_make_valid() 

# ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)), 
#                         "newRegion" = c(NA, "Forest", "dryShrubGrass", 
#                                         "dryShrubGrass", "Forest", "dryShrubGrass",
#                                        "dryShrubGrass", "Forest", "Forest", 
#                                        "dryShrubGrass", "Forest", "Forest", 
#                                        "Forest", "Forest", "dryShrubGrass", 
#                                        NA
#                                         ))
# goodRegions <- regions %>% 
#   left_join(ecoregionLU)
# mapRegions <- goodRegions %>% 
#   filter(!is.na(newRegion)) %>% 
#   group_by(newRegion) %>% 
#   summarise(geometry = sf::st_union(geometry)) %>% 
#   ungroup() %>% 
#   st_simplify(dTolerance = 1000) %>% 
#   sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))
# make shapefile of cropped state boundaries in appropriate crs
cropped_states_2 <- cropped_states %>% 
  st_transform(crs = crs(test_rast)) %>% 
  st_make_valid()  %>% 
  sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))

# make a multi-layer raster w/ layers for each variable of interest
pred_glm1_sf <- pred_glm1 %>% 
  st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>% 
  st_transform(crs(test_rast)) %>% 
  terra::vect()

# rasterize the observations and predictions
testRast2 <- lapply(c(response, paste0(response, "_pred"), "resid"), FUN = function(x) {
  # S4 method for class 'data.frame'
temp <- terra::rasterize(x = pred_glm1_sf, y = test_rast, 
                         field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
  # rast(pred_glm1_full[,c("x", "y", x)] %>% drop_na(), type="xyz", crs=terra::crs(test_rast$`precipitation_amount_day=43829`), digits=6, extent=test_rast$`precipitation_amount_day=43829`)
 return(temp)
  }
)
pred_glm1_RAST <- c(testRast2[[1]], testRast2[[2]], testRast2[[3]])

# ## compare
# par(mfrow = c(2,1))
# terra::plot(pred_glm1_RAST$TotalTreeCover,  maxcell = 30000000, main = "gridMet")
# terra::plot(pred_glm1_RAST_dayMet$TotalTreeCover, maxcell = 30000000, main = "dayMet")
# par(mfrow = c(1,1))
##
# make figure of observations
map_obs <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = .data[[response]]), maxcell = 50000000) + 
 #geom_sf(data = pred_glm1_RAST, aes(col = TotalTreeCover)) + 
  # stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) + 
  geom_sf(data=cropped_states_2 ,fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "forestgreen" , 
                       midpoint = 0,   na.value = "white") + 
  coord_sf(datum = st_crs(pred_glm1_RAST)) +
   # xlim(-104, -100) + 
   # ylim(30, 33)
  xlim(-2000000, 2500000) + 
  ylim(-1800000, 900000)
# histogram of observations
hist_obs <- pred_glm1%>% 
  ggplot() + 
  geom_histogram(aes(.data[[response]]), fill = "lightgrey", col = "darkgrey")

library(ggpubr)
ggarrange( map_obs, hist_obs, heights = c(3, 1), ncol = 1)

# 
# # rasterize
# # get reference raster
# test_rast <-  rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>% 
#   terra::aggregate(fact = 8, fun = "mean")
# ## add ecoregion boundaries (for our ecoregion level model)
# regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2") 
# regions <- regions %>% 
#   st_transform(crs = st_crs(test_rast)) %>% 
#   st_make_valid() 
# 
# ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)), 
#                         "newRegion" = c(NA, "Forest", "dryShrubGrass", 
#                                         "dryShrubGrass", "Forest", "dryShrubGrass",
#                                        "dryShrubGrass", "Forest", "Forest", 
#                                        "dryShrubGrass", "Forest", "Forest", 
#                                        "Forest", "Forest", "dryShrubGrass", 
#                                        NA
#                                         ))
# goodRegions <- regions %>% 
#   left_join(ecoregionLU)
# mapRegions <- goodRegions %>% 
#   filter(!is.na(newRegion)) %>% 
#   group_by(newRegion) %>% 
#   summarise(geometry = sf::st_union(geometry)) %>% 
#   ungroup() %>% 
#   st_simplify(dTolerance = 1000)
# #mapview(mapRegions)
# # rasterize data
# plotObs <- pred_glm1 %>% 
#          drop_na(paste(response)) %>% 
#   #slice_sample(n = 5e4) %>%
#   terra::vect(geom = c("Long", "Lat")) %>% 
#   terra::set.crs(crs(test_rast)) %>% 
#   terra::rasterize(y = test_rast, 
#                    field = response, 
#                    fun = mean) #%>% 
#   #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
#   #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# 
# # get the extent of this particular raster, and crop it accordingly
# tempExt <- crds(plotObs, na.rm = TRUE)
# 
# plotObs_2 <- plotObs %>% 
#   crop(ext(min(tempExt[,1]), max(tempExt[,1]),
#            min(tempExt[,2]), max(tempExt[,2])) 
#        )
# # make figures
# map_obs <- ggplot() +
# geom_spatraster(data = plotObs_2) + 
#   geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
#   geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
# labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
#   scale_fill_gradient2(low = "brown",
#                        mid = "wheat" ,
#                        high = "darkgreen" , 
#                        midpoint = 0,   na.value = "grey20") + 
#   xlim(st_bbox(plotObs_2)[c(1,3)]) + 
#   ylim(st_bbox(plotObs_2)[c(2,4)])
# 
# hist_obs <- ggplot(pred_glm1) + 
#   geom_histogram(aes(.data[[response]]), fill = "lightgrey", col = "darkgrey")
# 
# library(ggpubr)
# ggarrange(map_obs, hist_obs, heights = c(3,1), ncol = 1)

Predictions across the temporal range of the dataset

map_preds1 <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = .data[[paste0(response,"_pred")]]), maxcell = 50000000) +
  #geom_sf(data = highPred_points, col = "red") +
geom_sf(data=cropped_states_2 ,fill=NA ) +
  labs(title = paste0("Predictions from the 'best lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle =  "bestLambda model")  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "white",
                       limits = c(0,1)) + 
  xlim(-2000000, 2500000) + 
  ylim(-1800000, 900000)

hist_preds1 <- ggplot(pred_glm1) + 
  geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+ 
  xlim(c(0,1))

ggarrange(map_preds1, hist_preds1, heights = c(3,1), ncol = 1)

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {

# rasterize data
# make a multi-layer raster w/ layers for each variable of interest
pred_glm1_1se_sf <- pred_glm1_1se %>% 
  st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>% 
  st_transform(crs(test_rast)) %>% 
  terra::vect()

# rasterize the observations and predictions
testRast3 <- lapply(c(response, paste0(response, "_pred"), "resid"), FUN = function(x) {
  # S4 method for class 'data.frame'
temp <- terra::rasterize(x = pred_glm1_1se_sf, y = test_rast, 
                         field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
  # rast(pred_glm1_full[,c("x", "y", x)] %>% drop_na(), type="xyz", crs=terra::crs(test_rast$`precipitation_amount_day=43829`), digits=6, extent=test_rast$`precipitation_amount_day=43829`)
 return(temp)
  }
)
pred_glm1_1se_RAST <- c(testRast3[[1]], testRast3[[2]], testRast3[[3]])

# make figures
map_preds2 <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = .data[[paste0(response,"_pred")]]), maxcell = 50000000) + 
 geom_sf(data=cropped_states_2 ,fill=NA ) +
labs(title = paste0("Predictions from the ", name_secondBestMod, "fitted model of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle =  name_secondBestMod)  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 1,   na.value = "white",
                       limits = c(0, 1)) +
  xlim(-2000000, 2500000) + 
  ylim(-1800000, 900000)


hist_preds2 <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+ 
  xlim(c(0,1))

ggarrange(map_preds2, hist_preds2, heights = c(3,1), ncol = 1)
}

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>% 
  filter(resid > .5)  %>% 
  terra::vect(geom = c("x", "y")) %>% 
  terra::set.crs(crs("EPSG:4326")) %>% 
  terra::project(crs(test_rast))
badResids_low <- pred_glm1 %>% 
  filter(resid < -.5)  %>% 
  terra::vect(geom = c("x", "y")) %>% 
  terra::set.crs(crs("EPSG:4326")) %>% 
  terra::project(crs(test_rast))
# make figures
map <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = resid), maxcell = 50000000) + 
  geom_sf(data=cropped_states_2 ,fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
     subtitle = "bestLambda model \n red points indicate locations that have residuals below -0.5 \n blue points indicate locations that have residuals above 0.5") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "grey30",
                       limits = c(-1,1)
                       ) +
  xlim(-2000000, 2500000) + 
  ylim(-1800000, 900000)

hist <- ggplot(pred_glm1) + 
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") + 
  geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
  geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2)))) + 
  geom_vline(aes(xintercept = mean(resid)))

ggarrange(map, hist, heights = c(3,1), ncol = 1)

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1_1se %>% 
  filter(resid > .5)  %>% 
  terra::vect(geom = c("x", "y")) %>% 
  terra::set.crs(crs("EPSG:4326")) %>% 
  terra::project(crs(test_rast))
badResids_low <- pred_glm1_1se %>% 
  filter(resid < -.5)  %>% 
  terra::vect(geom = c("x", "y")) %>% 
  terra::set.crs(crs("EPSG:4326")) %>% 
  terra::project(crs(test_rast))
# make figures
map <- ggplot() +
geom_spatraster(data = pred_glm1_1se_RAST, aes(fill = resid), maxcell = 50000000) + 
  geom_sf(data=cropped_states_2 ,fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
     subtitle = "bestLambda model \n red points indicate locations that have residuals below -0.5 \n blue points indicate locations that have residuals above 0.5") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "grey30",
                       limits = c(-1,1)
                       ) +
  xlim(-2000000, 2500000) + 
  ylim(-1800000, 900000)

hist <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") + 
  geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
  geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2)))) + 
  geom_vline(aes(xintercept = mean(resid)))

ggarrange(map, hist, heights = c(3,1), ncol = 1)

}

Are there biases of the model predictions across year/lat/long?

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  # plot residuals against Year
yearResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) + 
  geom_smooth(aes(x = Year, y = resid)) + 
  xlab("Year") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")

# plot residuals against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = y, y = resid), alpha = .1) + 
  geom_smooth(aes(x = y, y = resid)) + 
  xlab("Latitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")

# plot residuals against Long
longResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = x, y = resid), alpha = .1) + 
  geom_smooth(aes(x = x, y = resid)) + 
  xlab("Longitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
library(patchwork)
(yearResidMod_bestLambda ) / 
(  latResidMod_bestLambda ) /
(  longResidMod_bestLambda )
} else {

# plot residuals against Year
yearResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) + 
  geom_smooth(aes(x = Year, y = resid)) + 
  xlab("Year") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
yearResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) + 
  geom_smooth(aes(x = Year, y = resid)) + 
  xlab("Year") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot residuals against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = y, y = resid), alpha = .1) + 
  geom_smooth(aes(x = y, y = resid)) + 
  xlab("Latitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
latResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = y, y = resid), alpha = .1) + 
  geom_smooth(aes(x = y, y = resid)) + 
  xlab("Latitude") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot residuals against Long
longResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = x, y = resid), alpha = .1) + 
  geom_smooth(aes(x = x, y = resid)) + 
  xlab("Longitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
longResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = x, y = resid), alpha = .1) + 
  geom_smooth(aes(x = x, y = resid)) + 
  xlab("Longitude") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

library(patchwork)
(yearResidMod_bestLambda + yearResidMod_1seLambda) / 
(  latResidMod_bestLambda + latResidMod_1seLambda) /
(  longResidMod_bestLambda + longResidMod_1seLambda)
}

Quantile plots

Binning predictor variables into “Quantiles” and looking at the mean predicted probability for each percentile.

# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                        pred_vars = prednames_fig, 
                                       cut_points = seq(0, 1, 0.01))
}
# get deciles for 1 SE lambda model 
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")} else {
  pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se,
                                      response_vars = response_vars,
                                        pred_vars = prednames_secondBestMod, 
                                       cut_points = seq(0, 1, 0.01))
  }

Below are quantile plots for the best lambda model (note that the predictor variables are scaled)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {

# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c(response, paste0(response, "_pred")), IQR = TRUE,
                        CI = FALSE
                        ) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c(response, paste0(response, "_pred")), dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Below are percentile plots from the second best lambda model ()

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  } else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c(response, paste0(response, "_pred")), IQR = TRUE) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c(response, paste0(response, "_pred")), dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  # png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
  #    units = "in", res = 600, width = 5.5, height = 3.5 )
  #   print(g4)
  # dev.off()
}

g4
}

Filtered Quantiles

For the best lambda model

Filtered ‘Quantile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = c(response, paste0(response, "_pred")),
                         pred_vars = prednames_fig,
                         filter_var = TRUE,
                         filter_vars = prednames_fig,
                         cut_points = seq(0, 1, 0.01)) 
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = response, 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}
## Processed 7517 groups out of 39197. 19% done. Time elapsed: 3s. ETA: 12s.Processed 10273 groups out of 39197. 26% done. Time elapsed: 4s. ETA: 11s.Processed 13067 groups out of 39197. 33% done. Time elapsed: 5s. ETA: 10s.Processed 15603 groups out of 39197. 40% done. Time elapsed: 6s. ETA: 9s.Processed 17975 groups out of 39197. 46% done. Time elapsed: 7s. ETA: 8s.Processed 20565 groups out of 39197. 52% done. Time elapsed: 8s. ETA: 7s.Processed 23330 groups out of 39197. 60% done. Time elapsed: 9s. ETA: 6s.Processed 25826 groups out of 39197. 66% done. Time elapsed: 10s. ETA: 5s.Processed 28341 groups out of 39197. 72% done. Time elapsed: 11s. ETA: 4s.Processed 30860 groups out of 39197. 79% done. Time elapsed: 12s. ETA: 3s.Processed 33450 groups out of 39197. 85% done. Time elapsed: 13s. ETA: 2s.Processed 35927 groups out of 39197. 92% done. Time elapsed: 14s. ETA: 1s.Processed 38444 groups out of 39197. 98% done. Time elapsed: 15s. ETA: 0s.Processed 39197 groups out of 39197. 100% done. Time elapsed: 15s. ETA: 0s.

Filtered quantile figure with middle 2 deciles also shown

if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = c(response, paste0(response, "_pred")),
                         pred_vars = prednames_fig,
                         filter_vars = prednames_fig,
                         filter_var = TRUE,
                         add_mid = TRUE,
                         cut_points = seq(0, 1, 0.01))

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = response, 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6)
#   g 
# dev.off()
}
g
}
## Processed 5767 groups out of 58796. 10% done. Time elapsed: 3s. ETA: 27s.Processed 8035 groups out of 58796. 14% done. Time elapsed: 4s. ETA: 25s.Processed 10406 groups out of 58796. 18% done. Time elapsed: 5s. ETA: 23s.Processed 12761 groups out of 58796. 22% done. Time elapsed: 6s. ETA: 21s.Processed 15113 groups out of 58796. 26% done. Time elapsed: 7s. ETA: 20s.Processed 17476 groups out of 58796. 30% done. Time elapsed: 8s. ETA: 18s.Processed 19858 groups out of 58796. 34% done. Time elapsed: 9s. ETA: 17s.Processed 22223 groups out of 58796. 38% done. Time elapsed: 10s. ETA: 16s.Processed 24582 groups out of 58796. 42% done. Time elapsed: 11s. ETA: 15s.Processed 26942 groups out of 58796. 46% done. Time elapsed: 12s. ETA: 14s.Processed 29303 groups out of 58796. 50% done. Time elapsed: 13s. ETA: 13s.Processed 31664 groups out of 58796. 54% done. Time elapsed: 14s. ETA: 11s.Processed 34028 groups out of 58796. 58% done. Time elapsed: 15s. ETA: 10s.Processed 36391 groups out of 58796. 62% done. Time elapsed: 16s. ETA: 9s.Processed 38696 groups out of 58796. 66% done. Time elapsed: 17s. ETA: 8s.Processed 41037 groups out of 58796. 70% done. Time elapsed: 18s. ETA: 7s.Processed 43340 groups out of 58796. 74% done. Time elapsed: 19s. ETA: 6s.Processed 45616 groups out of 58796. 78% done. Time elapsed: 20s. ETA: 5s.Processed 47963 groups out of 58796. 82% done. Time elapsed: 21s. ETA: 4s.Processed 50306 groups out of 58796. 86% done. Time elapsed: 22s. ETA: 3s.Processed 52662 groups out of 58796. 90% done. Time elapsed: 23s. ETA: 2s.Processed 55021 groups out of 58796. 94% done. Time elapsed: 24s. ETA: 1s.Processed 57261 groups out of 58796. 97% done. Time elapsed: 25s. ETA: 0s.Processed 58796 groups out of 58796. 100% done. Time elapsed: 25s. ETA: 0s.

For the second best lambda model ()

Filtered ‘Quantile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1_1se, 
                         response_vars = c(response, paste0(response, "_pred")),
                         pred_vars = prednames_secondBestMod,
                         filter_var = TRUE,
                         filter_vars = prednames_secondBestMod,
                         cut_points = seq(0, 1, 0.01)) 
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = response, 
                                xvars = prednames_fig)

if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}
## Processed 3885 groups out of 16197. 24% done. Time elapsed: 3s. ETA: 9s.Processed 6550 groups out of 16197. 40% done. Time elapsed: 4s. ETA: 5s.Processed 9420 groups out of 16197. 58% done. Time elapsed: 5s. ETA: 3s.Processed 12070 groups out of 16197. 75% done. Time elapsed: 6s. ETA: 2s.Processed 14898 groups out of 16197. 92% done. Time elapsed: 7s. ETA: 0s.Processed 16197 groups out of 16197. 100% done. Time elapsed: 7s. ETA: 0s.

Filtered quantile figure with middle 2 deciles also shown

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = c(response, paste0(response, "_pred")),
                         pred_vars = prednames_secondBestMod,
                         filter_vars = prednames_secondBestMod,
                         filter_var = TRUE,
                         add_mid = TRUE,
                         cut_points = seq(0, 1, 0.01))

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = response, 
                                xvars = prednames_fig)


if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
#      units = "in", res = 600, width = 5.5, height = 6 )
#   g 
# dev.off()
}
g
}
## Processed 5795 groups out of 24296. 24% done. Time elapsed: 3s. ETA: 9s.Processed 8011 groups out of 24296. 33% done. Time elapsed: 4s. ETA: 8s.Processed 10244 groups out of 24296. 42% done. Time elapsed: 5s. ETA: 6s.Processed 12480 groups out of 24296. 51% done. Time elapsed: 6s. ETA: 5s.Processed 14732 groups out of 24296. 61% done. Time elapsed: 7s. ETA: 4s.Processed 17027 groups out of 24296. 70% done. Time elapsed: 8s. ETA: 3s.Processed 19763 groups out of 24296. 81% done. Time elapsed: 9s. ETA: 2s.Processed 22279 groups out of 24296. 92% done. Time elapsed: 10s. ETA: 0s.Processed 24296 groups out of 24296. 100% done. Time elapsed: 11s. ETA: 0s.

Show model RMSE w/in each quantile

# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x") +
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle("RMSE by decile for bestLambda model")
}

# get deciles for 1 SE lambda model 
if (length(prednames_secondBestMod) == 0) {
  print("The 1SE (or 1/2 SE) lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_1se %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x") +
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle(paste0("RMSE by decile for ", name_secondBestMod, "model"))
}

Cross-validation

Using best lambda model

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals for each of the models fit to held-out ecoregions

These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {

## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1_s$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
#                        pred_opt = numeric(), pred_null = numeric()#,
#                        #pred_nopenalty = numeric()
#                        )

## get the model specification from the global model
mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]

f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))

X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response])


# now, loop through so with each iteration, a different ecoregion is held out
 for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){

  # split into training and test sets
  test_eco <- i_eco
  print(test_eco)
  # identify the rowID of observations to be in the training and test datasets
  train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
  test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'

  trainDat_all <- modDat_1_s %>%
    slice(train) %>%
    dplyr::select(-newRegion)
  testDat_all <- modDat_1_s %>%
    slice(test) %>%
    dplyr::select(-newRegion)

  # get the model matrices for input and response variables for cross validation model specification
  X_train <- as.matrix(X_cv[train,])
  X_test <- as.matrix(X_cv[test,])

  y_train <- modDat_1_s[train,response]
  y_test <- modDat_1_s[test,response]

  # get the model matrices for input and response variables for original model specification
  X_train_glob <- as.matrix(X[train,])
  X_test_glob <- as.matrix(X[test,])

  y_train_glob <- modDat_1_s[train,response]
  y_test_glob <- modDat_1_s[test,response]

  train_eco <- modDat_1_s$NA_L2NAME[train]

  ## just try a regular glm w/ the components from the global model
  fit_i <- glm(data = trainDat_all, formula = f_cv,
    ,
               family =  quasibeta_family(link = "logit")
    )
    
  # lasso model predictions with the optimal lambda (back transformed)
  optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response") 
  # null model and predictions
  # the "null" model in this case is the global model
  # predict on the test data for this iteration w/ the global model (back transformed)
  null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response") 


  # save data
  tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),
                    obs=y_test,
                    pred_opt=optimal_pred,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ) %>%
    cbind(testDat_all)

  # calculate RMSE, bias, etc. of
  # RMSE of CV model
  RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred,"y_test" = (y_test)), truth = "y_test", estimate = "optimal_pred")[1,]$.estimate
  # RMSE of global model
  RMSE_null <- yardstick::rmse(data = data.frame(null_pred,"y_test" = (y_test)), truth = "y_test", estimate = "null_pred")[1,]$.estimate
  # bias of CV model
  bias_optimal <- mean((y_test) - optimal_pred)
  # bias of global model
  bias_null <-  mean((y_test) - null_pred )

  # put output into a list
  tmpList <- list("testRegion" = i_eco,
    "modelObject" = fit_i,
       "modelPredictions" = tmp,
    "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
                                      "RMSE_globalModel" = RMSE_null,
                                      "bias_cvModel" = bias_optimal,
                                      "bias_globalModel" = bias_null))

  # save model outputs
  outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
 }
}
## [1] "COLD DESERTS"
## [1] "MARINE WEST COAST FOREST"
## [1] "MEDITERRANEAN CALIFORNIA"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTH CENTRAL SEMIARID PRAIRIES"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "TAMAULIPAS-TEXAS SEMIARID PLAIN"
## [1] "TEMPERATE PRAIRIES"
## [1] "TEXAS-LOUISIANA COASTAL PLAIN"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WARM DESERTS"
## [1] "WEST-CENTRAL SEMIARID PRAIRIES"
## [1] "WESTERN CORDILLERA"
## [1] "WESTERN SIERRA MADRE PIEDMONT"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

# table of model performance
purrr::map(outList, .f = function(x) {
  cbind(data.frame("holdout region" = x$testRegion),  x$performanceMetrics)
}
) %>%
  purrr::list_rbind() %>%
  kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                      "bias of CV model - mean(obs-pred.)", "bias of global model- mean(obs-pred.)"),
        caption = "Performance of Cross Validation using 'best lambda' model specification") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Performance of Cross Validation using ‘best lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model- mean(obs-pred.)
COLD DESERTS 0.1341145 0.1156436 0.0514421 0.0051384
MARINE WEST COAST FOREST 0.1675179 0.1031879 -0.0936344 -0.0435069
MEDITERRANEAN CALIFORNIA 0.0665148 0.0471299 -0.0462188 -0.0151416
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 0.0064697 0.0064429 -0.0049581 -0.0049229
MIXED WOOD PLAINS 0.0052410 0.0052404 0.0008632 0.0008587
MIXED WOOD SHIELD 0.0054141 0.0054122 -0.0041849 -0.0041824
OZARK/OUACHITA-APPALACHIAN FORESTS 0.0073440 0.0073376 0.0023301 0.0023078
SOUTH CENTRAL SEMIARID PRAIRIES 0.0604915 0.0538051 0.0224365 0.0086809
SOUTHEASTERN USA PLAINS 0.0149501 0.0145809 -0.0061987 -0.0055603
TAMAULIPAS-TEXAS SEMIARID PLAIN 0.0490835 0.0480654 -0.0134496 -0.0097438
TEMPERATE PRAIRIES 0.0108798 0.0106000 -0.0027802 -0.0027491
TEXAS-LOUISIANA COASTAL PLAIN 0.0346868 0.0332554 -0.0147596 -0.0125563
UPPER GILA MOUNTAINS 0.1198174 0.1195319 0.0080847 0.0078781
WARM DESERTS 0.1590506 0.1207981 0.0346470 -0.0018045
WEST-CENTRAL SEMIARID PRAIRIES 0.0648978 0.0542556 -0.0363991 -0.0117339
WESTERN CORDILLERA 0.0827635 0.0816384 -0.0001854 -0.0029051
WESTERN SIERRA MADRE PIEDMONT 0.0925131 0.0912268 -0.0328039 -0.0292711
# visualize model predictions
for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1!=0) %>%
    rownames()

  # calculate residuals
  predictionData <- predictionData %>%
  mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
         resid_globMod = .[["obs"]]  - .[["pred_null"]])


# rasterize
# use 'test_rast' from earlier

  # rasterize data
plotObs <- predictionData %>%
         drop_na(paste(response)) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("x", "y")) %>%
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

tempExt <- crds(plotObs, na.rm = TRUE)
plotObs_2 <- plotObs %>%  
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
       )

# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>%
  filter(resid > .5)  %>%
  terra::vect(geom = c("x", "y")) %>%
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast))
badResids_low <- predictionData %>%
  filter(resid < -.5)  %>%
  terra::vect(geom = c("x", "y")) %>%
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast)) 

# make figures
# make histogram
hist_i <- ggplot(predictionData) +
  geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
  xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <-  ggplot() +
geom_spatraster(data = plotObs_2) +
  geom_sf(data=cropped_states, fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
     subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-1, 1)) +
  xlim(c(min(tempExt[,1]), max(tempExt[,1]))) + 
  ylim(c(min(tempExt[,2]), max(tempExt[,2])))

 assign(paste0("residPlot_",holdoutRegion),
   value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)

}

  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

Using second best lambda model (either 1se or 1/2se)

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals for each of the models fit to held-out ecoregions

These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {

## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1_s$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
#                        pred_opt = numeric(), pred_null = numeric()#,
#                        #pred_nopenalty = numeric()
#                        )

## get the model specification from the global model
mat <- as.matrix(coef(mod_secondBest, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]

f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))

X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response])


# now, loop through so with each iteration, a different ecoregion is held out
 for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){

  # split into training and test sets
  test_eco <- i_eco
  print(test_eco)
  # identify the rowID of observations to be in the training and test datasets
  train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
  test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'

  trainDat_all <- modDat_1_s %>%
    slice(train) %>%
    dplyr::select(-newRegion)
  testDat_all <- modDat_1_s %>%
    slice(test) %>%
    dplyr::select(-newRegion)

  # get the model matrices for input and response variables for cross validation model specification
  X_train <- as.matrix(X_cv[train,])
  X_test <- as.matrix(X_cv[test,])

  y_train <- modDat_1_s[train,response]
  y_test <- modDat_1_s[test,response]

  # get the model matrices for input and response variables for original model specification
  X_train_glob <- as.matrix(X[train,])
  X_test_glob <- as.matrix(X[test,])

  y_train_glob <- modDat_1_s[train,response]
  y_test_glob <- modDat_1_s[test,response]

  train_eco <- modDat_1_s$NA_L2NAME[train]

  ## just try a regular glm w/ the components from the global model
  fit_i <- glm(data = trainDat_all, formula = f_cv,
    ,
               family =  quasibeta_family(link = "logit")
    )

    coef(fit_i)

  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response") 
  # null model and predictions
  # the "null" model in this case is the global model
  # predict on the test data for this iteration w/ the global model
  null_pred <- predict.glm(mod_secondBest, newdata = testDat_all, type = "response") 

  # save data
  tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),
                    obs=y_test,
                    pred_opt=optimal_pred,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ) %>%
    cbind(testDat_all)

  # calculate RMSE, bias, etc. of
  # RMSE of CV model
  RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred, "y_test" = (y_test)), truth = "y_test", estimate = "optimal_pred")[1,]$.estimate
  # RMSE of global model
  RMSE_null <- yardstick::rmse(data = data.frame(null_pred,  "y_test" = (y_test)), truth = "y_test", estimate = "null_pred")[1,]$.estimate
  # bias of CV model
  bias_optimal <- mean((y_test) - optimal_pred)
  # bias of global model
  bias_null <-  mean((y_test) - null_pred )

  # put output into a list
  tmpList <- list("testRegion" = i_eco,
    "modelObject" = fit_i,
       "modelPredictions" = tmp,
    "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
                                      "RMSE_globalModel" = RMSE_null,
                                      "bias_cvModel" = bias_optimal,
                                      "bias_globalModel" = bias_null))

  # save model outputs
  outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
 }
}
## [1] "COLD DESERTS"
## [1] "MARINE WEST COAST FOREST"
## [1] "MEDITERRANEAN CALIFORNIA"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTH CENTRAL SEMIARID PRAIRIES"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "TAMAULIPAS-TEXAS SEMIARID PLAIN"
## [1] "TEMPERATE PRAIRIES"
## [1] "TEXAS-LOUISIANA COASTAL PLAIN"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WARM DESERTS"
## [1] "WEST-CENTRAL SEMIARID PRAIRIES"
## [1] "WESTERN CORDILLERA"
## [1] "WESTERN SIERRA MADRE PIEDMONT"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
# table of model performance
purrr::map(outList, .f = function(x) {
  cbind(data.frame("holdout region" = x$testRegion),  x$performanceMetrics)
}
) %>%
  purrr::list_rbind() %>%
  kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                      "bias of CV model - mean(obs-pred.)", "bias of global model - mean(obs-pred.)"),
        caption = "Performance of Cross Validation using 'second best lambda' model specification") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
}
Performance of Cross Validation using ‘second best lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model - mean(obs-pred.)
COLD DESERTS 0.1310020 0.1172244 0.0452450 0.0068792
MARINE WEST COAST FOREST 0.1918065 0.1142412 -0.1053888 -0.0490029
MEDITERRANEAN CALIFORNIA 0.0495105 0.0447896 -0.0222773 -0.0090404
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 0.0118888 0.0117899 -0.0111547 -0.0110492
MIXED WOOD PLAINS 0.0049782 0.0049781 0.0005146 0.0005126
MIXED WOOD SHIELD 0.0053059 0.0053045 -0.0039231 -0.0039212
OZARK/OUACHITA-APPALACHIAN FORESTS 0.0070648 0.0070633 0.0011134 0.0011027
SOUTH CENTRAL SEMIARID PRAIRIES 0.0599564 0.0549794 0.0180652 0.0069678
SOUTHEASTERN USA PLAINS 0.0151373 0.0146685 -0.0062789 -0.0055413
TAMAULIPAS-TEXAS SEMIARID PLAIN 0.0481298 0.0479968 -0.0050586 -0.0038946
TEMPERATE PRAIRIES 0.0089918 0.0087805 -0.0027290 -0.0025498
TEXAS-LOUISIANA COASTAL PLAIN 0.0331154 0.0319326 -0.0136042 -0.0110371
UPPER GILA MOUNTAINS 0.1197774 0.1194956 0.0004819 0.0009192
WARM DESERTS 0.1436978 0.1238005 0.0105571 -0.0060685
WEST-CENTRAL SEMIARID PRAIRIES 0.0671029 0.0553692 -0.0391270 -0.0110658
WESTERN CORDILLERA 0.0824883 0.0816237 0.0003920 -0.0038804
WESTERN SIERRA MADRE PIEDMONT 0.0909841 0.0901610 -0.0280437 -0.0253318
if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1!=0) %>%
    rownames()

  # calculate residuals
  predictionData <- predictionData %>%
  mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
         resid_globMod = .[["obs"]]  - .[["pred_null"]])

  # rasterize data
plotObs <- predictionData %>%
         drop_na(paste(response)) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("x", "y")) %>%
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>%
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
       )

# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>%
  filter(resid > .5)  %>%
  terra::vect(geom = c("x", "y")) %>%
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast)) 
badResids_low <- predictionData %>%
  filter(resid < -.5)  %>%
  terra::vect(geom = c("x", "y")) %>% 
  terra::set.crs(crs("EPSG:4326")) %>%
  terra::project(crs(test_rast)) 


# make figures
# make histogram
hist_i <- ggplot(predictionData) +
  geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
  xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <-  ggplot() +
geom_spatraster(data = plotObs_2) +
  geom_sf(data=cropped_states,fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
     subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-1, 1))  +
  xlim(st_bbox(plotObs_2)[c(1,3)]) +
  ylim(st_bbox(plotObs_2)[c(2,4)])

 assign(paste0("residPlot_",holdoutRegion),
   value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)

}

  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
}
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

Save output

## save the coefficients for the models (best lambda, 1/2se lambda, 1se lambda)
if(trimAnom == TRUE) {
saveRDS(coefs, file = paste0("./models/modelCoefficients_trimAnom_", ecoregion, "_", response,"_removeAnomalies", removeAllAnoms,".rds"))
saveRDS(uniqueCoeffs, file = paste0("./models/modelMetrics_trimAnom_", ecoregion, "_", response,"_removeAnomalies", removeAllAnoms,".rds"))
} else {
saveRDS(coefs, file = paste0("./models/modelCoefficients_", ecoregion, "_", response,"_removeAnomalies", removeAllAnoms,".rds"))
saveRDS(uniqueCoeffs, file = paste0("./models/modelMetrics_", ecoregion, "_", response,"_removeAnomalies", removeAllAnoms,".rds"))
}
# make a table
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(fit)

session info

Hash of current commit (i.e. to ID the version of the code used)

system("git rev-parse HEAD", intern=TRUE)
## [1] "95e228db41b746742f867818bc949f28170da54a"

Packages etc.

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.2               doMC_1.3.8                 iterators_1.0.14           foreach_1.5.2             
##  [5] factoextra_1.0.7           USA.state.boundaries_1.0.1 glmnet_4.1-10              Matrix_1.7-4              
##  [9] kableExtra_1.4.0           rsample_1.3.2              here_1.0.2                 StepBeta_2.1.0            
## [13] ggtext_0.1.2               gridExtra_2.3              pdp_0.8.3                  GGally_2.4.0              
## [17] lubridate_1.9.4            forcats_1.0.1              stringr_1.6.0              dplyr_1.2.0               
## [21] purrr_1.2.1                readr_2.1.6                tidyr_1.3.2                tibble_3.3.1              
## [25] tidyverse_2.0.0            caret_7.0-1                lattice_0.22-7             ggplot2_4.0.2             
## [29] sf_1.0-24                  tidyterra_1.0.0            terra_1.8-93               ggspatial_1.1.10          
## [33] betareg_3.2-4              dtplyr_1.3.2               patchwork_1.3.2           
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3   rstudioapi_0.18.0    jsonlite_2.0.0       shape_1.4.6.1        magrittr_2.0.4       modeltools_0.2-24   
##   [7] farver_2.1.2         rmarkdown_2.30       vctrs_0.7.1          rstatix_0.7.3        htmltools_0.5.9      broom_1.0.12        
##  [13] Formula_1.2-5        pROC_1.19.0.1        sass_0.4.10          parallelly_1.46.1    KernSmooth_2.23-26   bslib_0.10.0        
##  [19] plyr_1.8.9           sandwich_3.1-1       zoo_1.8-15           cachem_1.1.0         commonmark_2.0.0     lifecycle_1.0.5     
##  [25] pkgconfig_2.0.3      R6_2.6.1             fastmap_1.2.0        future_1.69.0        digest_0.6.39        furrr_0.3.1         
##  [31] rprojroot_2.1.1      textshaping_1.0.4    labeling_0.4.3       yardstick_1.3.2      timechange_0.4.0     mgcv_1.9-4          
##  [37] abind_1.4-8          compiler_4.5.2       proxy_0.4-29         aod_1.3.3            withr_3.0.2          S7_0.2.1            
##  [43] backports_1.5.0      carData_3.0-6        DBI_1.2.3            ggstats_0.12.0       ggsignif_0.6.4       MASS_7.3-65         
##  [49] lava_1.8.2           classInt_0.4-11      gtools_3.9.5         ModelMetrics_1.2.2.2 tools_4.5.2          units_1.0-0         
##  [55] lmtest_0.9-40        otel_0.2.0           future.apply_1.20.1  nnet_7.3-20          glue_1.8.0           nlme_3.1-168        
##  [61] gridtext_0.1.5       grid_4.5.2           reshape2_1.4.5       generics_0.1.4       recipes_1.3.1        gtable_0.3.6        
##  [67] tzdb_0.5.0           class_7.3-23         data.table_1.18.2.1  hms_1.1.4            car_3.1-5            xml2_1.5.2          
##  [73] flexmix_2.3-20       markdown_2.0         ggrepel_0.9.6        pillar_1.11.1        splines_4.5.2        survival_3.8-6      
##  [79] tidyselect_1.2.1     knitr_1.51           litedown_0.9         svglite_2.2.2        stats4_4.5.2         xfun_0.56           
##  [85] hardhat_1.4.2        timeDate_4052.112    stringi_1.8.7        yaml_2.3.12          evaluate_1.0.5       codetools_0.2-20    
##  [91] cli_3.6.5            rpart_4.1.24         systemfonts_1.3.1    jquerylib_0.1.4      Rcpp_1.1.1           globals_0.19.0      
##  [97] gower_1.0.2          listenv_0.10.0       viridisLite_0.4.2    ipred_0.9-15         scales_1.4.0         prodlim_2025.04.28  
## [103] e1071_1.7-17         combinat_0.0-8       rlang_1.1.7          cowplot_1.2.0